import os
from pathlib import Path

import pdfplumber
from docx import Document
from openpyxl.reader.excel import load_workbook

from base.RAG_da_bang.re_utils import re_anonymize_all



def scan_files(root_dir, extensions=None, max_depth=None):
    """
    Recursively yield the files found under a directory tree.

    :param root_dir: Path of the root directory to scan.
    :param extensions: Iterable of allowed file suffixes (e.g. ['.txt', '.csv']),
        matched case-insensitively against ``Path.suffix``. None (or an empty
        collection) means every file is accepted.
    :param max_depth: Maximum number of directory levels below ``root_dir`` to
        include; 0 keeps only files located directly in ``root_dir``.
        None means no limit.
    :return: Generator of absolute ``Path`` objects, one per matching file.
    :raises FileNotFoundError: If ``root_dir`` does not exist. Because this is
        a generator function, the error is raised when iteration starts, not
        when the function is called.
    """
    root_path = Path(root_dir)
    if not root_path.exists():
        raise FileNotFoundError(f"目录不存在: {root_dir}")

    # Normalise the suffix filter once, before walking the tree. Doing it per
    # file was wasted work and silently broke one-shot iterables (a generator
    # would be exhausted after the first file, rejecting all later ones).
    allowed_suffixes = {ext.lower() for ext in extensions} if extensions else None

    for item in root_path.rglob('*'):
        if not item.is_file():
            continue

        # Suffix filter (case-insensitive).
        if allowed_suffixes is not None and item.suffix.lower() not in allowed_suffixes:
            continue

        # Depth filter: a file directly inside root_dir has depth 0.
        if max_depth is not None:
            depth = len(item.relative_to(root_path).parts) - 1
            if depth > max_depth:
                continue
        yield item.absolute()

from pptx import Presentation

def scan_ppt(pdf_path):
    """
    Print the text of every text-bearing shape in a PPTX presentation.

    :param pdf_path: Path of the presentation file; despite the parameter
        name, it is passed straight to ``pptx.Presentation``.
    :return: None. The text of each shape's text frame is written to stdout.
    """
    presentation = Presentation(pdf_path)

    for slide in presentation.slides:
        # Shapes without a text frame carry no text here, so they are skipped.
        text_shapes = (shp for shp in slide.shapes if shp.has_text_frame)
        for shp in text_shapes:
            print(shp.text_frame.text)
    # NOTE: a per-paragraph sensitive-information check based on
    # re_anonymize_all was sketched here earlier and is currently disabled.


if __name__ == '__main__':
    # Script entry point: list every matching presentation under dir_path and
    # print the text extracted from each one.
    try:
        dir_path = r'D:\工作\IT公司RAG打榜比赛营销相关数据需求\china_mobile_docs'
        exts = '.pptx'
        # Comma-separated suffix string -> list of trimmed suffixes
        # (None when the string is empty, meaning "no filter").
        extensions = list(map(str.strip, exts.split(','))) if exts else None
        print(extensions)

        print("\n扫描结果:")
        matches = scan_files(dir_path, extensions, max_depth=5)
        for idx, file in enumerate(matches, start=1):
            print(f"{idx:>4}. {file}")
            try:
                scan_ppt(file)
            except Exception as e:
                # Best effort: report the failing file and keep scanning.
                print(f"错误: {e}")
        # To debug a single presentation, call scan_ppt(<path>) directly.
    except Exception as e:
        print(f"错误: {e}")
